{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import Any, Optional\n",
    "\n",
    "from sqlalchemy import and_, func, select\n",
    "from sqlalchemy.sql.elements import BinaryExpression\n",
    "from sqlalchemy.types import Unicode\n",
    "\n",
    "from phoenix.db import models\n",
    "from phoenix.db.engines import create_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "baseline_experiment_id = 218\n",
    "compare_experiment_ids = [217]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "engine = create_engine(\"sqlite:////Users/xandersong/.phoenix/phoenix.db\")\n",
    "\n",
    "\n",
    "async def print_experiment_runs(\n",
    "    baseline_experiment_id: int,\n",
    "    compare_experiment_ids: list[int],\n",
    "    filter_condition: Optional[BinaryExpression[Any]] = None,\n",
    ") -> None:\n",
    "    async with engine.connect() as conn:\n",
    "        baseline_experiment = await conn.execute(\n",
    "            select(models.Experiment).where(models.Experiment.id == baseline_experiment_id)\n",
    "        )\n",
    "        baseline_experiment = (\n",
    "            await conn.execute(\n",
    "                select(models.Experiment).where(models.Experiment.id == baseline_experiment_id)\n",
    "            )\n",
    "        ).first()\n",
    "        assert baseline_experiment is not None\n",
    "        dataset_id = baseline_experiment.dataset_id\n",
    "        version_id = baseline_experiment.dataset_version_id\n",
    "\n",
    "        revision_ids = (\n",
    "            select(func.max(models.DatasetExampleRevision.id))\n",
    "            .join(\n",
    "                models.DatasetExample,\n",
    "                models.DatasetExample.id == models.DatasetExampleRevision.dataset_example_id,\n",
    "            )\n",
    "            .where(\n",
    "                and_(\n",
    "                    models.DatasetExampleRevision.dataset_version_id <= version_id,\n",
    "                    models.DatasetExample.dataset_id == dataset_id,\n",
    "                )\n",
    "            )\n",
    "            .group_by(models.DatasetExampleRevision.dataset_example_id)\n",
    "            .scalar_subquery()\n",
    "        )\n",
    "        examples = (\n",
    "            select(models.DatasetExample)\n",
    "            .join(\n",
    "                models.DatasetExampleRevision,\n",
    "                models.DatasetExample.id == models.DatasetExampleRevision.dataset_example_id,\n",
    "            )\n",
    "            .join(\n",
    "                models.ExperimentRun,\n",
    "                onclause=models.ExperimentRun.dataset_example_id == models.DatasetExample.id,\n",
    "            )\n",
    "            .join(\n",
    "                models.ExperimentRunAnnotation,\n",
    "                onclause=models.ExperimentRunAnnotation.experiment_run_id\n",
    "                == models.ExperimentRun.id,\n",
    "                isouter=True,\n",
    "            )\n",
    "            .where(\n",
    "                and_(\n",
    "                    models.DatasetExampleRevision.id.in_(revision_ids),\n",
    "                    models.DatasetExampleRevision.revision_kind != \"DELETE\",\n",
    "                )\n",
    "            )\n",
    "            .order_by(models.DatasetExampleRevision.dataset_example_id.desc())\n",
    "        )\n",
    "        if filter_condition is not None:\n",
    "            examples = examples.where(filter_condition)\n",
    "        print(examples.compile(compile_kwargs={\"literal_binds\": True}))\n",
    "        results = await conn.execute(examples)\n",
    "        for result in results:\n",
    "            print(result.id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[0].error is None`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.ExperimentRun.error.is_(None),\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[1].error is None`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.ExperimentRun.error.is_(None),\n",
    "        models.ExperimentRun.experiment_id == compare_experiment_ids[0],\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[0].latency_ms > 1000`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.ExperimentRun.latency_ms < 5000,\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[0].evals[\"judged_correct\"] == \"incorrect\"`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.ExperimentRunAnnotation.name == \"judged_correct\",\n",
    "        models.ExperimentRunAnnotation.label == \"incorrect\",\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[0].evals[\"matches_expected\"] <= 0.5`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.ExperimentRunAnnotation.name == \"matches_expected\",\n",
    "        models.ExperimentRunAnnotation.score <= 0.5,\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`\"specifications\" in experiments[0].input[\"question\"]`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.DatasetExampleRevision.input[\"question\"].cast(Unicode).contains(\"specifications\"),\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[0].input[\"question\"] == \"Can you give me the specifications of the Samsung Galaxy S21? Also, what are similar options?\"`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.DatasetExampleRevision.input[\"question\"].cast(Unicode)\n",
    "        == \"Can you give me the specifications of the Samsung Galaxy S21? Also, what are similar options?\",\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`experiments[0].reference_output[\"question\"] == \"Can you give me the specifications of the Samsung Galaxy S21? Also, what are similar options?\"`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.DatasetExampleRevision.output[\"expected_tool_calls\"]\n",
    "        .cast(Unicode)\n",
    "        .contains(\"product_details\"),\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`\"specifications\" in experiments[0].output[\"messages\"][0][\"content\"]`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "await print_experiment_runs(\n",
    "    baseline_experiment_id,\n",
    "    compare_experiment_ids,\n",
    "    and_(\n",
    "        models.ExperimentRun.output[\"task_output\"][\"messages\"][0][\"content\"]\n",
    "        .cast(Unicode)\n",
    "        .contains(\"Certainly\"),\n",
    "        models.ExperimentRun.experiment_id == baseline_experiment_id,\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "async with engine.connect() as conn:\n",
    "    results = await conn.execute(\n",
    "        select(models.ExperimentRun.output[\"task_output\"][\"messages\"][0][\"content\"].label(\"value\")),\n",
    "    )\n",
    "    for result in results:\n",
    "        if result.value:\n",
    "            print(result.value)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
